From b62e911341c8ec7446378b477c47da4256053dc0 Mon Sep 17 00:00:00 2001
From: Oliver Kiddle <opk@zsh.org>
Date: Sat, 13 May 2023 00:53:32 +0200
Subject: [PATCH] 51723: migrate pcre module to pcre2

---
 ChangeLog          |   3 +
 Src/Modules/pcre.c | 223 ++++++++++++++++++---------------------------
 Test/V07pcre.ztst  |  13 ++-
 configure.ac       |  20 ++--
 4 files changed, 110 insertions(+), 149 deletions(-)

# diff --git a/ChangeLog b/ChangeLog
# index f5c77f801..285b73b2c 100644
# --- a/ChangeLog
# +++ b/ChangeLog
# @@ -1,5 +1,8 @@
#  2023-05-13  Oliver Kiddle  <opk@zsh.org>
 
# +	* 51723: Src/Modules/pcre.c, Test/V07pcre.ztst, configure.ac:
# +	migrate pcre module to pcre2
# +
#  	* Felipe Contreras: 50612: Misc/vcs_info-examples: fix typo
 
#  	* github #98: Vidhan Bhatt: Completion/Darwin/Command/_shortcuts:
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -34,11 +34,11 @@
 #define CPCRE_PLAIN 0
 
 /**/
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#include <pcre.h>
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
 
-static pcre *pcre_pattern;
-static pcre_extra *pcre_hints;
+static pcre2_code *pcre_pattern;
 
 /**/
 static int
@@ -54,8 +54,8 @@ zpcre_utf8_enabled(void)
 	return 0;
 
     if ((have_utf8_pcre == -1) &&
-	(pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre))) {
-	    have_utf8_pcre = -2; /* erk, failed to ask */
+       (pcre2_config(PCRE2_CONFIG_UNICODE, &have_utf8_pcre))) {
+           have_utf8_pcre = -2; /* erk, failed to ask */
     }
 
     return (have_utf8_pcre == 1) && (!strcmp(nl_langinfo(CODESET), "UTF-8"));
@@ -69,47 +69,38 @@ zpcre_utf8_enabled(void)
 static int
 bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
 {
-    int pcre_opts = 0, pcre_errptr, target_len;
-    const char *pcre_error;
+    uint32_t pcre_opts = 0;
+    int target_len;
+    int pcre_error;
+    PCRE2_SIZE pcre_offset;
     char *target;
     
-    if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
-    if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
-    if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
-    if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
-    if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
+    if (OPT_ISSET(ops, 'a')) pcre_opts |= PCRE2_ANCHORED;
+    if (OPT_ISSET(ops, 'i')) pcre_opts |= PCRE2_CASELESS;
+    if (OPT_ISSET(ops, 'm')) pcre_opts |= PCRE2_MULTILINE;
+    if (OPT_ISSET(ops, 'x')) pcre_opts |= PCRE2_EXTENDED;
+    if (OPT_ISSET(ops, 's')) pcre_opts |= PCRE2_DOTALL;
     
     if (zpcre_utf8_enabled())
-	pcre_opts |= PCRE_UTF8;
-
-#ifdef HAVE_PCRE_STUDY
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
-#endif
+	pcre_opts |= PCRE2_UTF;
 
     if (pcre_pattern)
-	pcre_free(pcre_pattern);
+	pcre2_code_free(pcre_pattern);
     pcre_pattern = NULL;
 
     target = ztrdup(*args);
     unmetafy(target, &target_len);
 
-    if ((int)strlen(target) != target_len) {
-	zwarnnam(nam, "embedded NULs in PCRE pattern terminate pattern");
-    }
-
-    pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+    pcre_pattern = pcre2_compile((PCRE2_SPTR) target, (PCRE2_SIZE) target_len,
+	    pcre_opts, &pcre_error, &pcre_offset, NULL);
 
     free(target);
 
     if (pcre_pattern == NULL)
     {
-	zwarnnam(nam, "error in regex: %s", pcre_error);
+	PCRE2_UCHAR buffer[256];
+	pcre2_get_error_message(pcre_error, buffer, sizeof(buffer));
+	zwarnnam(nam, "error in regex: %s", buffer);
 	return 1;
     }
     
@@ -117,67 +108,48 @@ bin_pcre_compile(char *nam, char **args,
 }
 
 /**/
-#ifdef HAVE_PCRE_STUDY
-
-/**/
 static int
 bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int func))
 {
-    const char *pcre_error;
-
     if (pcre_pattern == NULL)
     {
 	zwarnnam(nam, "no pattern has been compiled for study");
 	return 1;
     }
-    
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
 
-    pcre_hints = pcre_study(pcre_pattern, 0, &pcre_error);
-    if (pcre_error != NULL)
-    {
-	zwarnnam(nam, "error while studying regex: %s", pcre_error);
-	return 1;
+    int jit = 0;
+    if (!pcre2_config(PCRE2_CONFIG_JIT, &jit) && jit) {
+	if (pcre2_jit_compile(pcre_pattern, PCRE2_JIT_COMPLETE) < 0) {
+	    zwarnnam(nam, "error while studying regex");
+	    return 1;
+	}
     }
     
     return 0;
 }
 
-/**/
-#else /* !HAVE_PCRE_STUDY */
-
-# define bin_pcre_study bin_notavail
-
-/**/
-#endif /* !HAVE_PCRE_STUDY */
-
-/**/
 static int
-zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
-		     char *substravar, int want_offset_pair, int matchedinarr,
-		     int want_begin_end)
+zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
+	char *matchvar, char *substravar, int want_offset_pair,
+	int matchedinarr, int want_begin_end)
 {
-    char **captures, *match_all, **matches;
+    PCRE2_SIZE *ovec;
+    char *match_all, **matches;
     char offset_all[50];
     int capture_start = 1;
 
     if (matchedinarr) {
-	/* bash-style captures[0] entire-matched string in the array */
+	/* bash-style ovec[0] entire-matched string in the array */
 	capture_start = 0;
     }
 
-    /* captures[0] will be entire matched string, [1] first substring */
-    if (!pcre_get_substring_list(arg, ovec, captured_count, (const char ***)&captures)) {
-	int nelem = arrlen(captures)-1;
+    /* ovec[0] will be entire matched string, [1] first substring */
+    ovec = pcre2_get_ovector_pointer(mdata);
+    if (ovec) {
+	int nelem = captured_count - 1;
 	/* Set to the offsets of the complete match */
 	if (want_offset_pair) {
-	    sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+	    sprintf(offset_all, "%ld %ld", ovec[0], ovec[1]);
 	    setsparam("ZPCRE_OP", ztrdup(offset_all));
 	}
 	/*
@@ -186,7 +158,7 @@ zpcre_get_substrings(char *arg, int *ove
 	 * ovec is length 2*(1+capture_list_length)
 	 */
 	if (matchvar) {
-	    match_all = metafy(captures[0], ovec[1] - ovec[0], META_DUP);
+	    match_all = metafy(arg + ovec[0], ovec[1] - ovec[0], META_DUP);
 	    setsparam(matchvar, match_all);
 	}
 	/*
@@ -201,16 +173,12 @@ zpcre_get_substrings(char *arg, int *ove
 	 */
 	if (substravar &&
 	    (!want_begin_end || nelem)) {
-	    char **x, **y;
+	    char **x;
 	    int vec_off, i;
-	    y = &captures[capture_start];
 	    matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
-	    for (i = capture_start; i < captured_count; i++, y++) {
+	    for (i = capture_start; i < captured_count; i++) {
 		vec_off = 2*i;
-		if (*y)
-		    *x++ = metafy(*y, ovec[vec_off+1]-ovec[vec_off], META_DUP);
-		else
-		    *x++ = NULL;
+		*x++ = metafy(arg + ovec[vec_off], ovec[vec_off+1]-ovec[vec_off], META_DUP);
 	    }
 	    *x = NULL;
 	    setaparam(substravar, matches);
@@ -247,7 +215,8 @@ zpcre_get_substrings(char *arg, int *ove
 	    setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
 	    if (nelem) {
 		char **mbegin, **mend, **bptr, **eptr;
-		int i, *ipair;
+		int i;
+		size_t *ipair;
 
 		bptr = mbegin = zalloc(sizeof(char*)*(nelem+1));
 		eptr = mend = zalloc(sizeof(char*)*(nelem+1));
@@ -287,8 +256,6 @@ zpcre_get_substrings(char *arg, int *ove
 		setaparam("mend", mend);
 	    }
 	}
-
-	pcre_free_substring_list((const char **)captures);
     }
 
     return 0;
@@ -314,7 +281,8 @@ getposint(char *instr, char *nam)
 static int
 bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
 {
-    int ret, capcount, *ovec, ovecsize, c;
+    int ret, c;
+    pcre2_match_data *pcre_mdata = NULL;
     char *matched_portion = NULL;
     char *plaintext = NULL;
     char *receptacle = NULL;
@@ -344,36 +312,30 @@ bin_pcre_match(char *nam, char **args, O
     /* For the entire match, 'Return' the offset byte positions instead of the matched string */
     if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
 
-    if ((ret = pcre_fullinfo(pcre_pattern, pcre_hints, PCRE_INFO_CAPTURECOUNT, &capcount)))
-    {
-	zwarnnam(nam, "error %d in fullinfo", ret);
-	return 1;
-    }
-
-    ovecsize = (capcount+1)*3;
-    ovec = zalloc(ovecsize*sizeof(int));
-
     plaintext = ztrdup(*args);
     unmetafy(plaintext, &subject_len);
 
     if (offset_start > 0 && offset_start >= subject_len)
-	ret = PCRE_ERROR_NOMATCH;
-    else
-	ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
+	ret = PCRE2_ERROR_NOMATCH;
+    else {
+	pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
+	ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
+		offset_start, 0, pcre_mdata, NULL);
+    }
 
     if (ret==0) return_value = 0;
-    else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
+    else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
+	zpcre_get_substrings(plaintext, pcre_mdata, ret, matched_portion, receptacle,
 			     want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
-	zwarnnam(nam, "error in pcre_exec [%d]", ret);
+	zwarnnam(nam, "error in pcre2_match [%d]", ret);
     }
     
-    if (ovec)
-	zfree(ovec, ovecsize*sizeof(int));
+    if (pcre_mdata)
+	pcre2_match_data_free(pcre_mdata);
     zsfree(plaintext);
 
     return return_value;
@@ -383,17 +345,19 @@ bin_pcre_match(char *nam, char **args, O
 static int
 cond_pcre_match(char **a, int id)
 {
-    pcre *pcre_pat;
-    const char *pcre_err;
+    pcre2_code *pcre_pat = NULL;
+    int pcre_err;
+    PCRE2_SIZE pcre_erroff;
     char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar, *svar;
-    int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
+    int r = 0, pcre_opts = 0;
+    pcre2_match_data *pcre_mdata = NULL;
     int lhstr_plain_len, rhre_plain_len;
     int return_value = 0;
 
     if (zpcre_utf8_enabled())
-	pcre_opts |= PCRE_UTF8;
+	pcre_opts |= PCRE2_UTF;
     if (isset(REMATCHPCRE) && !isset(CASEMATCH))
-	pcre_opts |= PCRE_CASELESS;
+	pcre_opts |= PCRE2_CASELESS;
 
     lhstr = cond_str(a,0,0);
     rhre = cond_str(a,1,0);
@@ -401,9 +365,6 @@ cond_pcre_match(char **a, int id)
     rhre_plain = ztrdup(rhre);
     unmetafy(lhstr_plain, &lhstr_plain_len);
     unmetafy(rhre_plain, &rhre_plain_len);
-    pcre_pat = NULL;
-    ov = NULL;
-    ovsize = 0;
 
     if (isset(BASHREMATCH)) {
 	svar = NULL;
@@ -415,27 +376,27 @@ cond_pcre_match(char **a, int id)
 
     switch(id) {
 	 case CPCRE_PLAIN:
-		if ((int)strlen(rhre_plain) != rhre_plain_len) {
-		    zwarn("embedded NULs in PCRE pattern terminate pattern");
-		}
-		pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
-		if (pcre_pat == NULL) {
-		    zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
+		if (!(pcre_pat = pcre2_compile((PCRE2_SPTR) rhre_plain,
+			(PCRE2_SIZE) rhre_plain_len, pcre_opts,
+			&pcre_err, &pcre_erroff, NULL)))
+		{
+		    PCRE2_UCHAR buffer[256];
+		    pcre2_get_error_message(pcre_err, buffer, sizeof(buffer));
+		    zwarn("failed to compile regexp /%s/: %s", rhre, buffer);
 		    break;
 		}
-                pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
-    		ovsize = (capcnt+1)*3;
-		ov = zalloc(ovsize*sizeof(int));
-    		r = pcre_exec(pcre_pat, NULL, lhstr_plain, lhstr_plain_len, 0, 0, ov, ovsize);
-		/* r < 0 => error; r==0 match but not enough size in ov
+		pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pat, NULL);
+		r = pcre2_match(pcre_pat, (PCRE2_SPTR8) lhstr_plain, lhstr_plain_len,
+			0, 0, pcre_mdata, NULL);
+		/* r < 0 => error; r==0 match but not enough size in match data
 		 * r > 0 => (r-1) substrings found; r==1 => no substrings
 		 */
     		if (r==0) {
-		    zwarn("reportable zsh problem: pcre_exec() returned 0");
+		    zwarn("reportable zsh problem: pcre2_match() returned 0");
 		    return_value = 1;
 		    break;
 		}
-	        else if (r==PCRE_ERROR_NOMATCH) {
+		else if (r == PCRE2_ERROR_NOMATCH) {
 		    return_value = 0; /* no match */
 		    break;
 		}
@@ -444,7 +405,7 @@ cond_pcre_match(char **a, int id)
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr_plain, ov, r, svar, avar, 0,
+		    zpcre_get_substrings(lhstr_plain, pcre_mdata, r, svar, avar, 0,
 					 isset(BASHREMATCH),
 					 !isset(BASHREMATCH));
 		    return_value = 1;
@@ -457,10 +418,10 @@ cond_pcre_match(char **a, int id)
 	free(lhstr_plain);
     if(rhre_plain)
 	free(rhre_plain);
+    if (pcre_mdata)
+	pcre2_match_data_free(pcre_mdata);
     if (pcre_pat)
-	pcre_free(pcre_pat);
-    if (ov)
-	zfree(ov, ovsize*sizeof(int));
+	pcre2_code_free(pcre_pat);
 
     return return_value;
 }
@@ -489,11 +450,11 @@ static struct builtin bintab[] = {
 
 static struct features module_features = {
     bintab, sizeof(bintab)/sizeof(*bintab),
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
     cotab, sizeof(cotab)/sizeof(*cotab),
-#else /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#else /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
     NULL, 0,
-#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#endif /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
     NULL, 0,
     NULL, 0,
     0
@@ -540,19 +501,9 @@ cleanup_(Module m)
 int
 finish_(UNUSED(Module m))
 {
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#ifdef HAVE_PCRE_STUDY
-    if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
-	pcre_free_study(pcre_hints);
-#else
-	pcre_free(pcre_hints);
-#endif
-    pcre_hints = NULL;
-#endif
-
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
     if (pcre_pattern)
-	pcre_free(pcre_pattern);
+	pcre2_code_free(pcre_pattern);
     pcre_pattern = NULL;
 #endif
 
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -129,12 +129,17 @@
 >78884; ZPCRE_OP: 25 30
 >90210; ZPCRE_OP: 31 36
 
-# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
+# Embedded NULs allowed in plaintext, in RE, pcre supports \0 as two-chars
   [[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
   print "${#MATCH}; ${#match[1]}; ${#match[2]}"
 0:ensure ASCII NUL passes in and out of matched plaintext
 >6; 3; 3
 
+# PCRE2 supports NULs also in the RE
+  [[ $'a\0b\0c' =~ $'^(.\0)+' ]] && print "${#MATCH}; ${#match[1]}"
+0:ensure ASCII NUL works also in the regex
+>4; 2
+
 # Ensure the long-form infix operator works
   [[ foo -pcre-match ^f..$ ]]
   print $?
@@ -181,7 +186,11 @@
   [[ é =~ '^..\z' ]]; echo $?
   LANG=$LANG_SAVE
   [[ é =~ '^.\z' ]]; echo $?
-0:swich between C/UTF-8 locales
+0:switch between C/UTF-8 locales
 >0
 >0
 >0
+
+  [[ abc =~ 'a(d*)bc' ]] && print "$#MATCH; $#match; ${#match[1]}"
+0:empty capture
+>3; 1; 0
--- a/configure.ac
+++ b/configure.ac
@@ -438,7 +438,7 @@ fi],
 
 dnl Do you want to look for pcre support?
 AC_ARG_ENABLE(pcre,
-AS_HELP_STRING([--enable-pcre],[enable the search for the pcre library (may create run-time library dependencies)]))
+AS_HELP_STRING([--enable-pcre],[enable the search for the pcre2 library (may create run-time library dependencies)]))
 
 dnl Do you want to look for capability support?
 AC_ARG_ENABLE(cap,
@@ -662,13 +662,12 @@ AC_HEADER_SYS_WAIT
 
 oldcflags="$CFLAGS"
 if test x$enable_pcre = xyes; then
-AC_CHECK_PROG([PCRECONF], pcre-config, pcre-config)
-dnl Typically (meaning on this single RedHat 9 box in front of me)
-dnl pcre-config --cflags produces a -I output which needs to go into
+AC_CHECK_PROG([PCRECONF], pcre2-config, pcre2-config)
+dnl pcre2-config --cflags may produce a -I output which needs to go into
 dnl CPPFLAGS else configure's preprocessor tests don't pick it up,
 dnl producing a warning.
-if test "x$ac_cv_prog_PCRECONF" = xpcre-config; then
-  CPPFLAGS="$CPPFLAGS `pcre-config --cflags`"
+if test "x$ac_cv_prog_PCRECONF" = xpcre2-config; then
+  CPPFLAGS="$CPPFLAGS `pcre2-config --cflags`"
 fi
 fi
 
@@ -678,9 +677,10 @@ AC_CHECK_HEADERS(sys/time.h sys/times.h
 		 locale.h errno.h stdio.h stdarg.h varargs.h stdlib.h \
 		 unistd.h sys/capability.h \
 		 utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \
-		 netinet/in_systm.h pcre.h langinfo.h wchar.h stddef.h \
+		 netinet/in_systm.h langinfo.h wchar.h stddef.h \
 		 sys/stropts.h iconv.h ncurses.h ncursesw/ncurses.h \
 		 ncurses/ncurses.h)
+AC_CHECK_HEADERS([pcre2.h],,,[#define PCRE2_CODE_UNIT_WIDTH 8])
 if test x$dynamic = xyes; then
   AC_CHECK_HEADERS(dlfcn.h)
   AC_CHECK_HEADERS(dl.h)
@@ -958,9 +958,7 @@ if test "x$ac_found_iconv" = "xyes"; the
 fi
 
 if test x$enable_pcre = xyes; then
-dnl pcre-config should probably be employed here
-dnl AC_SEARCH_LIBS(pcre_compile, pcre)
-  LIBS="`$ac_cv_prog_PCRECONF --libs` $LIBS"
+  LIBS="`$ac_cv_prog_PCRECONF --libs8` $LIBS"
 fi
 
 dnl ---------------------
@@ -1323,7 +1321,7 @@ AC_CHECK_FUNCS(strftime strptime mktime
 	       pathconf sysconf \
 	       tgetent tigetflag tigetnum tigetstr setupterm initscr resize_term \
 	       getcchar setcchar waddwstr wget_wch win_wch use_default_colors \
-	       pcre_compile pcre_study pcre_exec \
+	       pcre2_compile_8 \
 	       nl_langinfo \
 	       erand48 open_memstream \
 	       posix_openpt \
